Comparison of distributing or compressing levels


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%pylab inline
pd.__version__ # shown to confirm pandas >= 0.14.0, required for the MultiIndex slicing (.loc(axis=0)[...]) used in the plotting cell


Out[1]:
'0.14.1'

Read files


In [2]:
# Statistics tables behind the "(comp.)" curves plotted further down.
# Both files are long-format: one VALUE per row, keyed by the index columns.

# Overall statistics: rows indexed by (K, M), one column per STATISTIC.
oc = (
    pd.read_table("overall_statistics_3d.txt")
    .set_index(["K", "M", "STATISTIC"])["VALUE"]
    .unstack("STATISTIC")
)

# Per-variable statistics: rows indexed by (K, M); the columns form a
# two-level (VARIABLE, STATISTIC) MultiIndex.
vc = (
    pd.read_table("variable_statistics_3d.txt")
    .set_index(["K", "M", "STATISTIC", "VARIABLE"])["VALUE"]
    .unstack("VARIABLE")
    .unstack("STATISTIC")
)

In [3]:
# Statistics tables behind the "(dist.)" curves plotted further down
# (the "alldistributed" run). Same long-format layout: one VALUE per row.

# Overall statistics: rows indexed by (K, M), one column per STATISTIC.
od = (
    pd.read_table("overall_statistics_alldistributed_3d.txt")
    .set_index(["K", "M", "STATISTIC"])["VALUE"]
    .unstack("STATISTIC")
)

# Per-variable statistics: rows indexed by (K, M); the columns form a
# two-level (VARIABLE, STATISTIC) MultiIndex.
vd = (
    pd.read_table("variable_statistics_alldistributed_3d.txt")
    .set_index(["K", "M", "STATISTIC", "VARIABLE"])["VALUE"]
    .unstack("VARIABLE")
    .unstack("STATISTIC")
)

Add fixed compression ratios


In [4]:
N_c = 88*30 # for 3D variables, vertical stacking
N_d = 48602 # for all variables, vertical stacking
original_size = N_c * N_d


def compressed_size(K, M):
    """Return the stored size for parameters K and M.

    Evaluates ``N_d + N_c*K + N_d*M + N_c*K*M`` using the module-level
    ``N_c`` and ``N_d`` as they are at call time. ``K`` and ``M`` may be
    scalars or NumPy arrays; arrays are combined elementwise.
    """
    return N_d + N_c * K + N_d * M + N_c * K * M


# Ratio = compressed size / original size for every (K, M) row of `oc`.
# Dividing by float(original_size) keeps this a true division even where
# "/" on integer arrays would floor (Python 2 without
# `from __future__ import division`), which would silently give all zeros.
oc["compression_ratio_fixed"] = compressed_size(
    np.array(oc.index.get_level_values("K")),
    np.array(oc.index.get_level_values("M"))
) / float(original_size)
# Optional inspection (ratios as an M-by-K table):
#oc.loc[:,"compression_ratio_fixed"].unstack("K")

In [5]:
# NOTE: this cell rebinds N_c, N_d, original_size and compressed_size, which
# the compressed-run cell also assigns; run the cells in file order.
N_c = 88       # for 3D variables, vertical stacking (ncol & lev distributed)
N_d = 30*48602 # for 3D variables, vertical stacking (ncol & lev distributed)
original_size = N_c * N_d


def compressed_size(K, M):
    """Return the stored size for parameters K and M.

    Evaluates ``N_d + N_c*K + N_d*M + N_c*K*M`` using the module-level
    ``N_c`` and ``N_d`` as they are at call time. ``K`` and ``M`` may be
    scalars or NumPy arrays; arrays are combined elementwise.
    """
    return N_d + N_c * K + N_d * M + N_c * K * M


# Ratio = compressed size / original size for every (K, M) row of `od`.
# Dividing by float(original_size) keeps this a true division even where
# "/" on integer arrays would floor (Python 2 without
# `from __future__ import division`), which would silently give all zeros.
od["compression_ratio_fixed"] = compressed_size(
    np.array(od.index.get_level_values("K")),
    np.array(od.index.get_level_values("M"))
) / float(original_size)
# Optional inspection (ratios as an M-by-K table):
#od.loc[:,"compression_ratio_fixed"].unstack("K")

Compare errors and compression ratio


In [9]:
# error vs compression ratio, one line per K
grouped_c = vc.loc(axis=0)[6:10,:].mean(axis=1,level="STATISTIC").join(oc).reset_index().groupby("K")
grouped_d = vd.loc(axis=0)[6:10,:].mean(axis=1,level="STATISTIC").join(od).reset_index().groupby("K")
for key,grp in grouped_c:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],"-", label="K = " + str(key) + " (comp.)")
for key,grp in grouped_d:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],":", label="K = " + str(key) + " (dist.)")
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.show()
#plt.title("error vs compression ratio, by K")
#plt.xlim((0.08,0.11))
#plt.ylim((0.001,0.002))

In [7]:
# L_final vs compression ratio, one line per K.
# Reuses grouped_c / grouped_d built in the preceding plotting cell, so that
# cell must have been run first.
# Solid lines for the compressed run, dotted lines for the distributed run.
for groups, linestyle, tag in ((grouped_c, "-", "comp."), (grouped_d, ":", "dist.")):
    for key, grp in groups:
        plt.plot(
            grp["compression_ratio_fixed"],
            grp["L_final"],
            linestyle,
            label="K = " + str(key) + " (" + tag + ")",
        )

plt.legend()
plt.xlabel("compression ratio")
# This figure plots the "L_final" column, so label the axis accordingly
# (the label previously read "mean rms error", copied from the other plot).
plt.ylabel("L_final")
# Optional tweaks (uncomment as needed):
#plt.title("L_final vs compression ratio, by K")
#plt.xlim((0.08,0.11))
#plt.ylim((0.001,0.002))
# Render explicitly, as the rms_error cell does, so the cell does not end by
# echoing the Text object returned by plt.ylabel.
plt.show()


Out[7]:
<matplotlib.text.Text at 0x7f40479aed68>